import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import numpy as np
from scipy import stats
import sys
# The first command-line argument is a directory; put it on the import search
# path before the project-local imports further down are executed.
extra_import_dir = sys.argv[1]
sys.path.append(extra_import_dir)

import pandas as pd
import pickle
from decision_company import read_csv_file, sum_up, is_null

# Load the match table from <first CLI argument>/atp_tennis.csv.
atp_csv_path = os.path.join(sys.argv[1], 'atp_tennis.csv')
atp_tennis = read_csv_file(atp_csv_path)

# Run the null check over the whole table, then aggregate it with sum_up
# (presumably a per-column count of missing cells -- confirm in decision_company).
null_flags = is_null(atp_tennis)
missing_values = sum_up(null_flags)

print(missing_values)
# pickle.dump(missing_values,open("./ref_result/missing_values.pkl","wb"))

import pandas as pd
import pickle
from decision_company import read_csv_file, n_unique


# Columns handed to n_unique for the distinct-value report.
categorical_columns = ['Tournament', 'Series', 'Court', 'Surface', 'Round']
unique_values = n_unique(atp_tennis, categorical_columns)

print(unique_values)
# pickle.dump(unique_values,open("./ref_result/unique_values.pkl","wb"))

import pandas as pd
import pickle
from decision_company import read_csv_file, create_dataframe


# Put the two earlier results side by side, one report column per result.
report_columns = {
    'Missing Values': missing_values,
    'Unique Values': unique_values,
}
data_quality_report = create_dataframe(report_columns)

print(data_quality_report)
# pickle.dump(data_quality_report,open("./ref_result/data_quality_report.pkl","wb"))

import pandas as pd
import pickle
from decision_company import read_csv_file, count_unique_values, transform, generate_summary_stat, concatenate_objects, create_dataframe, generate_summary_stat


# Per-player win/loss ratios, attached to every match row, plus a stacked
# Rank / Win_Loss_Ratio table used by the statistics and plots that follow.
#
# NOTE(review): assumes count_unique_values returns a pandas Series of
# occurrence counts indexed by the counted value (value_counts-like) --
# confirm against decision_company.

# Count the total number of matches played by each player.
# A plain `+` aligns the two count Series on the player name and produces NaN
# for anyone who appears in only one of the two columns; adding with
# fill_value=0 treats "absent from this column" as zero appearances instead.
player_matches = count_unique_values(atp_tennis['Player_1']).add(
    count_unique_values(atp_tennis['Player_2']), fill_value=0
)

# Count the number of wins for each player, aligned to the full player index
# so that players who never won carry an explicit 0 rather than falling out
# of the division below as NaN.
player_wins = count_unique_values(atp_tennis['Winner']).reindex(
    player_matches.index, fill_value=0
)

# Calculate win/loss ratio for each player (wins divided by losses).
# A player with no recorded losses divides by zero here, so the ratio is not
# finite for them.
win_loss_ratios = player_wins / (player_matches - player_wins)

# Add win/loss ratios to the dataset, looked up by each side's player name.
atp_tennis['Win_Loss_Ratio_1'] = transform(atp_tennis['Player_1'], win_loss_ratios)
atp_tennis['Win_Loss_Ratio_2'] = transform(atp_tennis['Player_2'], win_loss_ratios)

# Descriptive statistics over the four per-match columns.
desc_stats = generate_summary_stat(atp_tennis[['Rank_1', 'Rank_2', 'Win_Loss_Ratio_1', 'Win_Loss_Ratio_2']])

# Combine Rank_1/Rank_2 and Win_Loss_Ratio_1/Win_Loss_Ratio_2 into single
# columns (player-1 values first, then player-2 values).
combined_ranks = concatenate_objects(atp_tennis['Rank_1'], atp_tennis['Rank_2'])
combined_win_loss_ratios = concatenate_objects(atp_tennis['Win_Loss_Ratio_1'], atp_tennis['Win_Loss_Ratio_2'])

# Create a DataFrame with combined ranks and win/loss ratios.
combined_data = create_dataframe({'Rank': combined_ranks, 'Win_Loss_Ratio': combined_win_loss_ratios})

print(combined_data)
# pickle.dump(combined_data,open("./ref_result/combined_data.pkl","wb"))

import pandas as pd
import numpy as np
import pickle
from decision_company import read_csv_file, generate_summary_stat


# Summary statistics for the stacked Rank / Win_Loss_Ratio table.
desc_stats = generate_summary_stat(
    combined_data,
)

print(desc_stats)
# pickle.dump(desc_stats,open("./ref_result/desc_stats.pkl","wb"))

import pandas as pd
import matplotlib.pyplot as plt 
import pickle
from decision_company import read_csv_file, create_subplots, create_histogram_subplot, set_plot_split_title, make_xlabel, save_plot, show_plots


# Distribution of player rankings, drawn from the stacked ranks of both
# player columns and written to ./ref_result/hist_chart.png.
histogram_options = {'bins': 50, 'alpha': 0.5}

fig1, ax1 = create_subplots()
create_histogram_subplot(ax1, combined_ranks, **histogram_options)
set_plot_split_title(ax1, 'Player Rankings Distribution')
make_xlabel(ax1, 'Ranking')
save_plot('./ref_result/hist_chart.png')
# show_plots()

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from decision_company import read_csv_file, create_subplots, set_plot_split_title, show_plots, make_xlabel, save_plot, create_scatter_plot, make_ylabel, show_plots


# Ranking (x) against win/loss ratio (y), one point per row of combined_data;
# the figure is written to ./ref_result/scatter_chart.png.
fig2, ax2 = create_subplots()
scatter_x = combined_data['Rank']
scatter_y = combined_data['Win_Loss_Ratio']
create_scatter_plot(ax2, scatter_x, scatter_y, alpha=0.5)
set_plot_split_title(ax2, 'Player Rankings vs Win/Loss Ratios')
make_xlabel(ax2, 'Ranking')
make_ylabel(ax2, 'Win/Loss Ratio')
save_plot('./ref_result/scatter_chart.png')
# show_plots()


import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from decision_company import read_csv_file, count_unique_values, create_subplots, set_plot_split_title, show_plots, make_xlabel, save_plot, make_ylabel, create_bar_chart, show_plots


# Number of matches per surface type, written to ./ref_result/bar_chart.png.
fig3, ax3 = create_subplots()
surface_counts = count_unique_values(atp_tennis['Surface'])
create_bar_chart(ax3, surface_counts)

# Title first, then the x and y axis labels -- same order as the direct calls.
for apply_label, label_text in (
    (set_plot_split_title, 'Matches by Surface Type'),
    (make_xlabel, 'Surface'),
    (make_ylabel, 'Number of Matches'),
):
    apply_label(ax3, label_text)

save_plot('./ref_result/bar_chart.png')
# show_plots()


import pandas as pd
import numpy as np
import pickle
from decision_company import read_csv_file, df_copy


# Copy out the surface type together with both players' ranks and win/loss
# ratios.
surface_feature_columns = ['Surface', 'Rank_1', 'Rank_2', 'Win_Loss_Ratio_1', 'Win_Loss_Ratio_2']
surface_data = df_copy(atp_tennis[surface_feature_columns])

print(surface_data)
# pickle.dump(surface_data,open("./ref_result/surface_data.pkl","wb"))

import pandas as pd
import pickle
from decision_company import read_csv_file, df_copy, dropna


# Keep only the rows that dropna retains for the two ratio columns, then copy
# the result.
# NOTE(review): presumably this drops rows where either ratio is missing;
# whether non-finite ratios are also removed depends on the dropna helper --
# confirm.
ratio_columns = ['Win_Loss_Ratio_1', 'Win_Loss_Ratio_2']
rows_with_ratios = dropna(surface_data, subset_columns=ratio_columns)
surface_data_clean = df_copy(rows_with_ratios)

print(surface_data_clean)
# pickle.dump(surface_data_clean,open("./ref_result/surface_data_clean.pkl","wb"))

import pandas as pd
import pickle
from decision_company import read_csv_file, get_dummies


# One-hot encode the Surface column. Prefix and separator are both empty,
# so presumably the new columns are named after the bare surface values --
# confirm against the get_dummies helper.
dummy_options = {'columns': ['Surface'], 'prefix': '', 'prefix_sep': ''}
surface_data_clean = get_dummies(surface_data_clean, **dummy_options)

print(surface_data_clean)
# pickle.dump(surface_data_clean,open("./ref_result/surface_data_clean.pkl","wb"))

